In [1]:
import numpy as np
import tensorflow as tf
from openpyxl import load_workbook
from collections import namedtuple
import time
Load data from the xlsx file. I loaded the xlsx file and split it into inputs and labels. Finally, I also split long inputs into fixed-length chunks to generate more training data.
In [2]:
# Load data from xlsx file
# Load data from the xlsx file.
# NOTE(review): assumes 'skill_map_data.xlsx' sits next to the notebook.
wb = load_workbook('skill_map_data.xlsx')
## print(wb.sheetnames)
ws = wb['raw data - Chapter and Text']  # wb.get_sheet_by_name() is deprecated in openpyxl
raw_data = []
for row in ws.iter_rows():
    raw_data.append({
        "week_day": row[0].value,
        "chapter": row[1].value,
        "lesson": row[2].value,
        "section": row[3].value,
        "text": row[4].value,
    })
raw_data = raw_data[2:]  # remove table name and header rows
assert(len(raw_data) < 100)  # normally we don't have 100+ sections

# Split raw_data into inputs (the section text) and labels.
inputs = [row['text'] for row in raw_data]
assert(len(raw_data) == len(inputs))

## Concatenate week_day, chapter, lesson, section into one label string.
# (join() already inserts the separator; the old extra ' ' list items
# only produced runs of three spaces inside every label.)
labels = [' '.join([str(row['week_day']),
                    row['chapter'],
                    row['lesson'],
                    row['section']]) for row in raw_data]
assert(len(raw_data) == len(labels))

# Split long inputs into seq_len-sized chunks to generate more training data.
seq_len = 100  # length for splitting long text
seq_inputs = []
seq_labels = []
for i, text in enumerate(inputs):  # `text`, not `input` — avoid shadowing the builtin
    if len(text) > seq_len:
        # Ceil division so the tail of the text is never dropped.
        # (The old `int(len(text)/seq_len + 0.5)` nearest-rounding silently
        # discarded up to seq_len/2 trailing characters.)
        n_chunks = (len(text) + seq_len - 1) // seq_len
        for j in range(n_chunks):
            seq_inputs.append(text[j * seq_len:(j + 1) * seq_len])
            seq_labels.append(labels[i])
    else:
        seq_inputs.append(text)
        seq_labels.append(labels[i])
len(seq_inputs), len(seq_labels)
Out[2]:
In [3]:
# From here on, work with the chunked sequences as the inputs/labels.
inputs, labels = seq_inputs, seq_labels
In [4]:
inputs[:5]
Out[4]:
In [5]:
from string import punctuation
# Join all inputs into one corpus and tokenize on whitespace.
# NOTE(review): the original also had
#     all_text = ''.join([c for c in inputs if c not in punctuation])
# which iterated over whole strings (not characters), so the punctuation
# test never matched — and the result was immediately overwritten by the
# join below anyway.  That dead line is removed here; behavior is
# unchanged (punctuation is kept).  To strip punctuation for real, it
# must also be stripped before building `inputs_ints`, otherwise the
# vocab lookup there would KeyError on punctuated tokens.
all_text = ' '.join(inputs)
words = all_text.split()
In [6]:
len(words), len(all_text), len(inputs)
Out[6]:
In [7]:
all_text[:200]
Out[7]:
In [8]:
words[:10]
Out[8]:
In [9]:
from collections import Counter
# Build the vocabulary, most frequent word first.  Word ids start at 1
# so that 0 stays free for left-padding the feature rows later.
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)
vocab_to_int = {word: idx for idx, word in enumerate(vocab, 1)}

# Encode every input text as a list of word ids.
inputs_ints = [[vocab_to_int[word] for word in text.split()] for text in inputs]
In [10]:
# Map each distinct label string to an integer id, with the reverse map
# kept for decoding predictions.
labels_set = set(labels)
int_to_label = dict(enumerate(labels_set))
label_to_int = {label: idx for idx, label in enumerate(labels_set)}

# The RNN produces one output per time step, so repeat each label id
# seq_len times to match the (batch, num_steps) target shape.
seq_len = 100
labels = np.array([[label_to_int[label]] * seq_len for label in labels],
                  dtype=np.int32)
In [11]:
# test encoded labels
test_index = 6
test_label = int_to_label[test_index]
assert(test_index == label_to_int[test_label])
assert(len(inputs) == len(labels))
labels[:1], labels.shape
Out[11]:
Now, create an array `features` that contains the data we'll pass to the network. The data should come from `inputs_ints`, since we want to feed integers to the network. Each row should be `seq_len` (100) elements long. For texts shorter than 100 words, left-pad with 0s. That is, if the text is ['best', 'movie', 'ever'], [117, 18, 128] as integers, the row will look like [0, 0, 0, ..., 0, 117, 18, 128]. For texts longer than 100 words, use only the first 100 words as the feature vector.
In [12]:
# Filter out that inputs with 0 length
inputs_ints = [each for each in inputs_ints if len(each) > 0]
In [13]:
seq_len = 100
# Size the matrix from inputs_ints (the *filtered* list), not `inputs`:
# len(inputs) left stale all-zero rows whenever empty inputs were removed
# in the previous cell.
features = np.zeros((len(inputs_ints), seq_len), dtype=int)
for i, row in enumerate(inputs_ints):
    # Left-pad short rows with 0s; truncate long rows to seq_len.
    features[i, -len(row):] = np.array(row)[:seq_len]
In [14]:
# Show shape and first row together — a bare `features.shape` on its own
# line was dead code, since only a cell's last expression is displayed.
features.shape, features[0]
Out[14]:
In [15]:
# Split into train / validation / test sets (80% / 10% / 10%).
split_frac = 0.8
split_idx = int(len(features) * split_frac)  # was hard-coded 0.8, ignoring split_frac
train_x, val_x = features[:split_idx], features[split_idx:]
train_y, val_y = labels[:split_idx], labels[split_idx:]

# Halve the remaining 20% into validation and test.
test_idx = int(len(val_x) * 0.5)
val_x, test_x = val_x[:test_idx], val_x[test_idx:]
val_y, test_y = val_y[:test_idx], val_y[test_idx:]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape),
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))
In [16]:
def build_rnn(num_classes, batch_size=50, num_steps=50, lstm_size=128, num_layers=2,
              learning_rate=0.001, grad_clip=5, sampling=False):
    """Build the TF1 LSTM classification graph.

    Args:
        num_classes: number of distinct target classes (one-hot depth).
        batch_size: sequences per batch.
        num_steps: time steps per sequence.
        lstm_size: hidden units per LSTM layer.
        num_layers: number of stacked LSTM layers.
        learning_rate: Adam learning rate.
        grad_clip: global-norm gradient clipping threshold.
        sampling: if True, build a 1x1 graph for step-by-step sampling.

    Returns:
        A `Graph` namedtuple exposing the placeholders and ops needed for
        training/inference: inputs, targets, initial_state, final_state,
        keep_prob, cost, preds, optimizer.
    """
    # When using this network for sampling later, we pass in one
    # character at a time, so provide an option for that.
    if sampling:
        batch_size, num_steps = 1, 1

    tf.reset_default_graph()

    # Placeholders fed into the graph.
    inputs = tf.placeholder(tf.int32, [batch_size, num_steps], name='inputs')
    targets = tf.placeholder(tf.int32, [batch_size, num_steps], name='targets')
    # Keep probability placeholder for the dropout layers.
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

    # One-hot encode the input and target ids.
    x_one_hot = tf.one_hot(inputs, num_classes)
    y_one_hot = tf.one_hot(targets, num_classes)

    ### Build the RNN layers
    def make_cell():
        # A *fresh* cell per layer.  The old `[drop] * num_layers` reused
        # one cell object for every layer, which makes all layers share
        # variables and raises a variable-reuse error in TF >= 1.1.
        lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
        return tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)

    cell = tf.contrib.rnn.MultiRNNCell([make_cell() for _ in range(num_layers)])
    initial_state = cell.zero_state(batch_size, tf.float32)

    ### Run the data through the RNN layers
    # One tensor per step, as static_rnn expects a Python list.
    # (`squeeze_dims` is the deprecated alias of `axis`.)
    rnn_inputs = [tf.squeeze(i, axis=[1]) for i in tf.split(x_one_hot, num_steps, 1)]
    outputs, state = tf.contrib.rnn.static_rnn(cell, rnn_inputs, initial_state=initial_state)
    final_state = state

    # Reshape outputs to one row per step per batch element.
    seq_output = tf.concat(outputs, axis=1)
    output = tf.reshape(seq_output, [-1, lstm_size])

    # Connect the RNN outputs to a softmax layer.
    with tf.variable_scope('softmax'):
        softmax_w = tf.Variable(tf.truncated_normal((lstm_size, num_classes), stddev=0.1))
        softmax_b = tf.Variable(tf.zeros(num_classes))
    logits = tf.matmul(output, softmax_w) + softmax_b
    preds = tf.nn.softmax(logits, name='predictions')

    # Cross-entropy loss against the reshaped one-hot targets.
    y_reshaped = tf.reshape(y_one_hot, [-1, num_classes])
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y_reshaped)
    cost = tf.reduce_mean(loss)

    # Adam with global-norm gradient clipping to control exploding gradients.
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), grad_clip)
    train_op = tf.train.AdamOptimizer(learning_rate)
    optimizer = train_op.apply_gradients(zip(grads, tvars))

    # Export the graph nodes as a namedtuple for convenient access.
    export_nodes = ['inputs', 'targets', 'initial_state', 'final_state',
                    'keep_prob', 'cost', 'preds', 'optimizer']
    Graph = namedtuple('Graph', export_nodes)
    local_dict = locals()
    graph = Graph(*[local_dict[each] for each in export_nodes])
    return graph
In [17]:
def get_batches(x, y, batch_size=100):
    """Yield successive (x, y) batches of exactly `batch_size` rows.

    Any trailing partial batch is discarded, so every yielded pair has
    the same length.
    """
    n_full = len(x) // batch_size
    limit = n_full * batch_size
    x, y = x[:limit], y[:limit]
    for start in range(0, limit, batch_size):
        end = start + batch_size
        yield x[start:end], y[start:end]
In [18]:
# Sanity-check the generator: the first batch should hold batch_size rows.
# (The old `list(test)[0]` materialized *every* batch just to look at the
# first one; next() pulls a single batch.)
test = get_batches(train_x, train_y, batch_size=100)
first = next(test)
len(first[0])
Out[18]:
In [19]:
# Hyperparameters for the RNN and training loop.
batch_size = 100  # sequences per batch
num_steps = 100  # time steps per sequence (matches seq_len above)
lstm_size = 256  # hidden units per LSTM layer
num_layers = 2  # stacked LSTM layers
learning_rate = 0.001
keep_prob = 0.5  # dropout keep probability used during training
In [20]:
epochs = 20
# Save every N iterations
save_every_n = 15

# Number of distinct target classes.  NOT len(labels): at this point
# `labels` is the (num_samples, seq_len) target array, so len(labels)
# is the sample count, which wildly over-sizes the softmax layer.
num_classes = len(labels_set)

model = build_rnn(num_classes,
                  batch_size=batch_size,
                  num_steps=num_steps,
                  learning_rate=learning_rate,
                  lstm_size=lstm_size,
                  num_layers=num_layers)

saver = tf.train.Saver(max_to_keep=100)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Use the line below to load a checkpoint and resume training
    #saver.restore(sess, 'checkpoints/______.ckpt')

    # Batches per epoch = samples // batch_size.  The original used
    # train_x.shape[1]/num_steps, i.e. seq_len/num_steps, which is
    # always 1 here and made the iteration counter meaningless.
    n_batches = train_x.shape[0] // batch_size
    iterations = n_batches * epochs
    for e in range(epochs):
        # Train network
        new_state = sess.run(model.initial_state)
        loss = 0
        # Pass batch_size, not num_steps — the original only worked
        # because both happen to equal 100.
        for b, (x, y) in enumerate(get_batches(train_x, train_y, batch_size), 1):
            iteration = e * n_batches + b
            start = time.time()
            feed = {model.inputs: x,
                    model.targets: y,
                    model.keep_prob: keep_prob,
                    model.initial_state: new_state}
            batch_loss, new_state, _ = sess.run([model.cost, model.final_state, model.optimizer],
                                                feed_dict=feed)
            loss += batch_loss
            end = time.time()
            print('Epoch {}/{} '.format(e + 1, epochs),
                  'Iteration {}/{}'.format(iteration, iterations),
                  'Training loss: {:.4f}'.format(loss / b),
                  '{:.4f} sec/batch'.format((end - start)))
            if (iteration % save_every_n == 0) or (iteration == iterations):
                # Check performance; notice dropout has been set to 1.
                val_loss = []
                new_state = sess.run(model.initial_state)
                for x, y in get_batches(val_x, val_y, batch_size):
                    feed = {model.inputs: x,
                            model.targets: y,
                            model.keep_prob: 1.,
                            model.initial_state: new_state}
                    batch_loss, new_state = sess.run([model.cost, model.final_state], feed_dict=feed)
                    val_loss.append(batch_loss)
                print('Validation loss:', np.mean(val_loss),
                      'Saving checkpoint!')
                saver.save(sess, "checkpoints/i{}_l{}_v{:.3f}.ckpt".format(iteration, lstm_size, np.mean(val_loss)))
In [ ]: